........................... RAW POLLS DATASET VISUALIZATIONS ...........................

Importing Dependencies
In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')
Reading data
In [2]:
# Load the raw polls workbook into the DataFrame that every later cell reads.
RAW_POLLS_PATH = "RAW POLLS.xlsx"
total_df = pd.read_excel(RAW_POLLS_PATH)
In [3]:
# Show the loaded frame; the rendering below is truncated to the first and last rows.
total_df
Out[3]:
pollno race year location type_simple type_detail pollster partisan polldate samplesize ... margin_poll electiondate cand1_actual cand2_actual margin_actual error bias rightcall comment Number
0 5380001 1998_House-G_ID-1 1998 ID-1 House-G House-G Garin-Hart-Yang Research Group D 10/13/98 372.0 ... 2.0 1998-03-11 00:00:00 44.73 55.27 -10.54 12.54 12.54 0.0 None 1
1 5380002 1998_House-G_ID-2 1998 ID-2 House-G House-G Garin-Hart-Yang Research Group D 10/13/98 400.0 ... 6.0 1998-03-11 00:00:00 44.69 52.51 -7.82 13.82 13.82 0.0 None 1
2 5380003 1998_House-G_US 1998 US House-G House-G Zogby Interactive/JZ Analytics None 10/13/98 864.0 ... 3.0 1998-03-11 00:00:00 46.42 47.99 -1.57 4.57 4.57 0.0 previously listed as Zogby Analytics, telephone 1
3 5380004 1998_Sen-G_NV 1998 NV Sen-G Sen-G Fairbank, Maslin, Maullin, Metz & Associates D 10/13/98 1418.0 ... 5.0 1998-03-11 00:00:00 47.88 47.78 0.10 4.90 4.9 1.0 None 1
4 5380005 1998_Sen-G_NY 1998 NY Sen-G Sen-G Blum & Weprin Associates None 10/13/98 364.0 ... -8.0 1998-03-11 00:00:00 54.62 44.08 10.54 18.54 -18.54 0.0 None 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23098 5386610 2013_Gov-G_VA 2013 VA Gov-G Gov-G Public Policy Polling None 2013-03-11 00:00:00 870.0 ... 7.0 2013-05-11 00:00:00 47.75 45.23 2.52 4.48 4.48 1.0 None 3
23099 5386611 2014_House-GS_FL-13 2014 FL-13 House-G House-GS Fabrizio, Lee & Associates None 2/18/2014 400.0 ... -2.0 2014-11-03 00:00:00 46.55 48.43 -1.88 0.12 -0.12 1.0 for Chamber of Commerce 3
23100 5386612 2014_House-GS_FL-13 2014 FL-13 House-G House-GS StPetePolls.org None 2/25/2014 1269.0 ... 0.0 2014-11-03 00:00:00 46.55 48.43 -1.88 1.88 1.88 0.5 aka Flextel, Inc. 3
23101 5386613 2014_House-GS_FL-13 2014 FL-13 House-G House-GS Political Marketing International, Inc./Red Ra... None 2/26/2014 391.0 ... -2.0 2014-11-03 00:00:00 46.55 48.43 -1.88 0.12 -0.12 1.0 None 3
23102 5386614 2014_House-GS_FL-13 2014 FL-13 House-G House-GS Public Policy Polling None 2014-08-03 00:00:00 702.0 ... 3.0 2014-11-03 00:00:00 46.55 48.43 -1.88 4.88 4.88 0.0 for League of Conservation Voters 3

23103 rows × 25 columns

Exploratory Data Analysis
In [4]:
# List the column names available for the plots below.
total_df.columns
Out[4]:
Index(['pollno', 'race', 'year', 'location', 'type_simple', 'type_detail',
       'pollster', 'partisan', 'polldate', 'samplesize', 'cand1_name',
       'cand1_pct', 'cand2_name', 'cand2_pct', 'cand3_pct', 'margin_poll',
       'electiondate', 'cand1_actual', 'cand2_actual', 'margin_actual',
       'error', 'bias', 'rightcall', 'comment', 'Number'],
      dtype='object')
In [5]:
# Missing-value count per column (`isna` is the same check as `isnull`).
total_df.isna().sum()
Out[5]:
pollno           0
race             0
year             0
location         0
type_simple      0
type_detail      0
pollster         0
partisan         0
polldate         0
samplesize       0
cand1_name       0
cand1_pct        0
cand2_name       0
cand2_pct        0
cand3_pct        0
margin_poll      0
electiondate     0
cand1_actual     0
cand2_actual     0
margin_actual    0
error            0
bias             0
rightcall        0
comment          0
Number           0
dtype: int64
In [6]:
# Print dtypes and non-null counts; note that bias and cand3_pct are reported as object, not float.
total_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23103 entries, 0 to 23102
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pollno         23103 non-null  int64  
 1   race           23103 non-null  object 
 2   year           23103 non-null  int64  
 3   location       23103 non-null  object 
 4   type_simple    23103 non-null  object 
 5   type_detail    23103 non-null  object 
 6   pollster       23103 non-null  object 
 7   partisan       23103 non-null  object 
 8   polldate       23103 non-null  object 
 9   samplesize     23103 non-null  float64
 10  cand1_name     23103 non-null  object 
 11  cand1_pct      23103 non-null  float64
 12  cand2_name     23103 non-null  object 
 13  cand2_pct      23103 non-null  float64
 14  cand3_pct      23103 non-null  object 
 15  margin_poll    23103 non-null  float64
 16  electiondate   23103 non-null  object 
 17  cand1_actual   23103 non-null  float64
 18  cand2_actual   23103 non-null  float64
 19  margin_actual  23103 non-null  float64
 20  error          23103 non-null  float64
 21  bias           23103 non-null  object 
 22  rightcall      23103 non-null  float64
 23  comment        23103 non-null  object 
 24  Number         23103 non-null  int64  
dtypes: float64(9), int64(3), object(13)
memory usage: 4.4+ MB
In [7]:
# Number of distinct values in each column.
total_df.nunique()
Out[7]:
pollno           8908
race             1865
year               21
location          442
type_simple         5
type_detail        18
pollster          463
partisan            4
polldate         1220
samplesize       1530
cand1_name         50
cand1_pct         323
cand2_name         56
cand2_pct         328
cand3_pct         139
margin_poll       418
electiondate      148
cand1_actual     1882
cand2_actual     1902
margin_actual    2051
error            3357
bias             3984
rightcall           3
comment           225
Number              3
dtype: int64
In [8]:
# Summary statistics for the numeric columns only (object-dtype columns are not included).
total_df.describe()
Out[8]:
pollno year samplesize cand1_pct cand2_pct margin_poll cand1_actual cand2_actual margin_actual error rightcall Number
count 2.310300e+04 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000 23103.000000
mean 6.194403e+06 2007.632862 828.886725 44.640533 42.655040 1.985493 48.461471 45.724205 2.737219 5.462165 0.818617 1.917846
std 3.275895e+06 4.978101 1963.709678 8.911051 9.880802 15.010319 8.449223 9.645330 15.702886 4.874776 0.373029 0.804984
min 7.369900e+04 1998.000000 25.000000 3.000000 4.000000 -80.000000 5.400000 4.090000 -84.100000 0.000000 0.000000 1.000000
25% 5.381619e+06 2004.000000 500.000000 40.000000 38.000000 -5.000000 44.070000 41.300000 -5.010000 1.990000 1.000000 1.000000
50% 5.383548e+06 2008.000000 625.000000 45.900000 44.000000 2.000000 48.840000 46.960000 2.980000 4.210000 1.000000 2.000000
75% 5.385475e+06 2012.000000 828.000000 50.000000 48.000000 9.000000 53.150000 51.010000 11.040000 7.510000 1.000000 3.000000
max 2.538000e+07 2018.000000 134000.000000 88.000000 87.000000 84.000000 90.860000 89.500000 86.780000 50.980000 1.000000 3.000000
1. Which value has the highest count in each of these columns (year, type_simple, partisan, type_detail)?
In [9]:
sns.set(rc={'axes.facecolor': 'w', 'figure.facecolor': 'gold'})

# One count panel per categorical column, stacked vertically.
count_columns = ["year", "type_simple", "partisan", "type_detail"]
fig, axes = plt.subplots(len(count_columns), figsize=(18, 25))

for ax, column in zip(axes, count_columns):
    # Pass the frame as `data` and the column as `x`: on seaborn >= 0.12 the
    # first positional parameter is `data`, so a bare positional Series is no
    # longer read as the x variable.
    sns.countplot(data=total_df, x=column, ax=ax, palette="Set1")

# Rotate the labels of the last panel (type_detail) explicitly instead of
# relying on pyplot's implicit "current axes".
axes[-1].tick_params(axis="x", labelcolor="black", labelrotation=90)
plt.show()
In [10]:
# Derive the slice sizes from the data instead of hand-copied totals, so the
# donuts cannot drift away from the workbook contents.
year_counts = total_df["year"].value_counts().sort_index()
type_simple_counts = total_df["type_simple"].value_counts()

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Left donut: polls per election year (labels as strings, e.g. '2008').
fig.add_trace(
    go.Pie(
        labels=year_counts.index.astype(str).tolist(),
        values=year_counts.tolist(),
        name="YEAR",
    ),
    1, 1,
)

# Right donut: polls per simplified race type.
fig.add_trace(
    go.Pie(
        labels=type_simple_counts.index.tolist(),
        values=type_simple_counts.tolist(),
        name="TYPE SIMPLE",
    ),
    1, 2,
)

fig.update_traces(hole=.5, hoverinfo="label+percent+name", textposition='inside')

fig.update_layout(
    {'plot_bgcolor': 'rgb(240,230,140)', 'paper_bgcolor': 'rgb(154,205,50)'},
    font=dict(color="white"),
    title_text="YEAR and TYPE SIMPLE",
    legend_title="YEAR",
    # Captions placed in the hole of each donut.
    annotations=[dict(text='YEAR', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='TYPE SIMPLE', x=0.86, y=0.5, font_size=20, showarrow=False)],
)

fig.show()
In [11]:
# Derive the slice sizes from the data. The previously hand-typed type_detail
# totals added up to 23091 although the frame has 23103 rows, so at least one
# of them was wrong.
partisan_counts = total_df["partisan"].value_counts()
type_detail_counts = total_df["type_detail"].value_counts()

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# Left donut: polls per partisan affiliation. The trace name is shown on hover,
# so it must describe this donut rather than repeat "TYPE SIMPLE".
fig.add_trace(
    go.Pie(
        labels=partisan_counts.index.tolist(),
        values=partisan_counts.tolist(),
        name="PARTISAN",
    ),
    1, 1,
)

# Right donut: polls per detailed race type.
fig.add_trace(
    go.Pie(
        labels=type_detail_counts.index.tolist(),
        values=type_detail_counts.tolist(),
        name="TYPE DETAIL",
    ),
    1, 2,
)

fig.update_traces(hole=.5, hoverinfo="label+percent+name", textposition='inside')

fig.update_layout(
    {'plot_bgcolor': 'rgb(240,230,140)', 'paper_bgcolor': 'rgb(128,0,128)'},
    font=dict(color="white"),
    title_text="PARTISAN and TYPE DETAIL",
    legend_title="TYPE DETAIL",
    # Captions placed in the hole of each donut.
    annotations=[dict(text='PARTISAN', x=0.15, y=0.5, font_size=20, showarrow=False),
                 dict(text='TYPE DETAIL', x=0.87, y=0.5, font_size=20, showarrow=False)],
)

fig.show()
In [12]:
# Histogram of the number of polls per election year.
fig = px.histogram(
    total_df["year"],
    title="YEAR count",
    color_discrete_sequence=["crimson"],
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(119,136,153)',
    width=980,
    height=400,
    font_color="white",
)
fig.show()
In [13]:
# Histogram of the number of polls per simplified race type.
fig = px.histogram(
    total_df["type_simple"],
    title="TYPE SIMPLE count",
    color_discrete_sequence=["orange"],
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(0,0,0)',
    width=980,
    height=400,
    font_color="white",
)
fig.show()
In [14]:
# Histogram of the number of polls per detailed race type.
fig = px.histogram(
    total_df["type_detail"],
    title="TYPE DETAIL count",
    color_discrete_sequence=["green"],
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(255,69,0)',
    width=980,
    height=400,
    font_color="white",
)
fig.show()
In [15]:
# Histogram of the number of polls per partisan affiliation.
fig = px.histogram(
    total_df["partisan"],
    title="PARTISAN count",
    color_discrete_sequence=["darkblue"],
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(255,255,0)',
    width=980,
    height=400,
    font_color="black",
)
fig.show()
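OBSERVATION: From these plots: in Partisan, most polls (22,223) are marked "None"; among polls with a partisan affiliation, Democrat ("D") has the highest count (475). In Type Detail and Type Simple, "Pres-G" has the highest count (5838). For Year, "2008" has the highest count (about 20% of all polls).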

---------------------------------------------------------------------------------------------------------------

2. Which pollster participated the most, and in which year and race?
In [16]:
# Size each slice by the number of polls a pollster ran. Using the `year`
# column as the pie value would add up calendar years (e.g. 2008 + 2012 + ...),
# which is not a participation measure.
pollster_counts = (
    total_df["pollster"]
    .value_counts()
    .rename_axis("pollster")
    .reset_index(name="polls")
)

# Per-row hover fields (race, year, location) are intentionally not attached:
# a slice aggregates many polls, so any single row's values would be misleading.
fig = px.pie(
    pollster_counts,
    values="polls",
    names="pollster",
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='POLLSTER RACE YEAR',
)
fig.update_traces(textposition='inside', hole=.4)
fig.update_layout(
    # In-range equivalent of the former 'rgba(222, 993, 1000, 200)', whose
    # green/blue/alpha components exceeded their valid maxima.
    {'paper_bgcolor': 'rgb(222,255,255)'},
    uniformtext_minsize=10,
    legend_title="POLLSTER",
    uniformtext_mode='hide',
    font=dict(color="Blue"),
)
fig.show()
OBSERVATION: From this plot, for year 2000 the pollster "Rasmussen Reports/Pulse Opinion Research" has the highest poll share (8.62%); race --> 2000_Pres-R_SC, location --> 2000_Gov-G_WV.

--------------------------------------------------------------------------------------------------------------

3. What is the year-wise participation for Type Simple and Type Detail?
In [17]:
# Sunburst with type_simple on the inner ring and year on the outer ring.
fig = px.sunburst(
    total_df,
    path=['type_simple', 'year'],
    color='year',
    hover_data=['type_simple', 'year'],
)
fig.update_layout(
    title="YEAR wise count of TYPE SIMPLE",
    paper_bgcolor='rgb(0,0,0)',
    font_color="white",
    margin={"t": 100, "l": 100, "r": 0, "b": 0},
)
fig.show()
In [18]:
# Sunburst with type_detail on the inner ring and year on the outer ring.
fig = px.sunburst(
    total_df,
    path=['type_detail', 'year'],
    color='year',
    hover_data=['type_detail', 'year'],
)
fig.update_layout(
    title="YEAR wise count of TYPE DETAIL",
    paper_bgcolor='rgb(0,0,0)',
    font_color="white",
    margin={"t": 100, "l": 100, "r": 0, "b": 0},
)
fig.show()

Both Type_Simple & Type_Detail

In [19]:
# Three-level sunburst: type_simple -> type_detail -> year.
# No `values` column is given, so sectors are sized by the number of polls,
# consistent with the two single-level sunbursts of this section. Sizing by
# `year` would add up calendar years, which has no meaning as a quantity.
fig = px.sunburst(
    total_df,
    path=['type_simple', 'type_detail', "year"],
    hover_data=['type_simple', 'type_detail'],
    color="year",
)
fig.show()
OBSERVATION: From these plots, in Type Simple "Gov-G" and "House-G", and in Type Detail "Sen-G" and "Gov-G", participate the most from 1998 to 2016. In both Type Simple and Type Detail, "Pres-P" participates the most in 2008.

-------------------------------------------------------------------------------------------------------------

4. Which pollster has the highest "margin_actual", and which has the most negative?
In [20]:
# Donut of margin_actual grouped by pollster.
# NOTE(review): margin_actual has negative values (describe() reports a minimum
# of -84.1) and a pie cannot show negative shares - confirm how those rows are
# rendered before reading conclusions off the slices.
fig = px.pie(
    total_df,
    names='pollster',
    values='margin_actual',
    title='POLLSTER vs MARGINAL_ACTUAL',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(hole=0.4, textposition='inside')
fig.update_layout(
    plot_bgcolor='rgb(245,255,250)',
    paper_bgcolor='rgb(255,215,0)',
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font_color="black",
)
fig.show()
In [21]:
# One bar per pollster, aggregating margin_actual over that pollster's polls.
fig = px.histogram(total_df, x="pollster", y="margin_actual", color="pollster",
                   title='POLLSTER vs MARGINAL_ACTUAL')

# No marker restyling via selector=dict(mode='markers') here: histogram traces
# have no `mode` property, so such a selector matches nothing.

fig.update_layout(
    {'plot_bgcolor': 'rgb(245,255,250)', 'paper_bgcolor': 'rgb(128,128,0)'},
    font=dict(color="white"),
    # 463 pollster names do not fit on the axis; hide tick labels and legend.
    xaxis=dict(showticklabels=False),
    width=980,
    height=500,
    showlegend=False,
)
fig.show()
In [22]:
# One marker per poll: pollster on x, margin_actual on y, coloured by pollster.
fig = px.scatter(
    total_df,
    x="pollster",
    y="margin_actual",
    color="pollster",
    title='POLLSTER vs MARGINAL_ACTUAL',
)
# Enlarge and outline the markers of every marker-mode trace.
fig.update_traces(
    selector={"mode": 'markers'},
    marker={"size": 15, "line": {"width": 1, "color": 'DarkSlateGrey'}},
)
fig.update_layout(
    plot_bgcolor='rgb(245,255,250)',
    paper_bgcolor='rgb(244,164,96)',
    font_color="black",
    xaxis_showticklabels=False,
    showlegend=False,
    height=500,
    width=980,
)
fig.show()
OBSERVATION: From these plots, the pollster "YouGov" has the highest "margin_actual" (13,464,29171), and the pollster "Dan Jones & Associates" has the most negative "margin_actual" (-2,348,659).

--------------------------------------------------------------------------------------------------------------

5. What are Candidate 1's percentages, and which candidate got the highest?
In [23]:
# Bubble scatter: cand1_pct per cand1_name, bubble size also driven by cand1_pct.
fig = px.scatter(
    total_df,
    x="cand1_name",
    y="cand1_pct",
    size="cand1_pct",
    color="cand1_name",
    color_continuous_scale=px.colors.sequential.Viridis,
    title='CANDIDATE 1 & CANDIDATE 1 PERCENTAGE',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(0,250,154)',
    font_color="black",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [24]:
# Box plot of cand1_pct for each cand1_name.
fig = px.box(
    total_df,
    x="cand1_name",
    y="cand1_pct",
    color="cand1_name",
    title='CANDIDATE 1 & CANDIDATE 1 PERCENTAGE',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(139,69,19)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [25]:
# Density contours of cand1_pct for each cand1_name.
fig = px.density_contour(
    total_df,
    x="cand1_name",
    y="cand1_pct",
    color="cand1_name",
    title='CANDIDATE 1 & CANDIDATE 1 PERCENTAGE',
)
fig.update_traces(line={"width": 1.5})
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(199,21,133)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [26]:
# Pie of cand1_pct grouped by cand1_name.
# NOTE(review): the rgba() background below has components above their valid
# maxima; presumably the renderer clamps them - confirm the intended colour.
fig = px.pie(
    total_df,
    names='cand1_name',
    values='cand1_pct',
    title='CANDIDATE 1 & CANDIDATE 1 PERCENTAGE',
    color_discrete_sequence=px.colors.sequential.RdBu,
    hover_data=["race", "year", "location"],
)
fig.update_traces(hole=0.2, textposition='inside')
fig.update_layout(
    paper_bgcolor='rgba(222, 993, 1000, 200)',
    legend_title="CANDIDATE NAMES",
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font_color="Blue",
)
fig.show()
In [27]:
# Sunburst: cand1_name on the inner ring, year on the outer ring, sized by cand1_pct.
fig = px.sunburst(
    total_df,
    path=['cand1_name', 'year'],
    values='cand1_pct',
    color='cand1_name',
    title='CANDIDATE 1 & CANDIDATE 1 PERCENTAGE',
)
fig.update_layout(
    paper_bgcolor='rgb(255,255,0)',
    font_color="black",
    margin={"t": 50, "l": 100, "r": 0, "b": 0},
)
fig.show()
OBSERVATION: From these plots, among Candidate 1 entries "Democrat" has the highest percentage (88%), followed by "Obama". The "Democrat" candidate appears from 1998 to 2017 and has outliers.

-------------------------------------------------------------------------------------------------------

6. What are Candidate 2's percentages, and which candidate got the highest?
In [28]:
# Bubble scatter: cand2_pct per cand2_name, bubble size also driven by cand2_pct.
fig = px.scatter(
    total_df,
    x="cand2_name",
    y="cand2_pct",
    size="cand2_pct",
    color="cand2_name",
    color_continuous_scale=px.colors.sequential.Viridis,
    title='CANDIDATE 2 & CANDIDATE 2 PERCENTAGE',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(0,0,0)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [29]:
# Density contours of cand2_pct for each cand2_name.
fig = px.density_contour(
    total_df,
    x="cand2_name",
    y="cand2_pct",
    color="cand2_name",
    title='CANDIDATE 2 & CANDIDATE 2 PERCENTAGE',
)
fig.update_traces(line={"width": 1.5})
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(189,183,107)',
    font_color="black",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=550,
    width=980,
)
fig.show()
In [30]:
# Box plot of cand2_pct for each cand2_name.
fig = px.box(
    total_df,
    x="cand2_name",
    y="cand2_pct",
    color="cand2_name",
    title='CANDIDATE 2 & CANDIDATE 2 PERCENTAGE',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(128,128,0)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [31]:
# Pie of cand2_pct grouped by cand2_name.
fig = px.pie(
    total_df,
    names='cand2_name',
    values='cand2_pct',
    title='CANDIDATE 2 & CANDIDATE 2 PERCENTAGE',
    color_discrete_sequence=px.colors.sequential.Jet,
    hover_data=["race", "year", "location"],
)
fig.update_traces(hole=0.2, textposition='inside')
fig.update_layout(
    paper_bgcolor='rgb(255,140,0)',
    legend_title="CANDIDATE NAMES",
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font_color="black",
)
fig.show()
In [32]:
# Sunburst: cand2_name on the inner ring, year on the outer ring, sized by cand2_pct.
fig = px.sunburst(
    total_df,
    path=['cand2_name', 'year'],
    values='cand2_pct',
    color='cand2_name',
    title='CANDIDATE 2 & CANDIDATE 2 PERCENTAGE',
)
fig.update_layout(
    paper_bgcolor='rgb(220,220,220)',
    font_color="black",
    margin={"t": 50, "l": 100, "r": 0, "b": 0},
)
fig.show()
OBSERVATION: From these plots, among Candidate 2 entries "Republican" has the highest percentage (84%), followed by "Clinton". The "Republican" candidate appears from 1998 to 2017 and has outliers.

---------------------------------------------------------------------------------------------------------------

7. What are Candidate 1's actual percentages, and which candidate got the highest?
In [33]:
# Bubble scatter: cand1_actual per cand1_name, bubble size also driven by cand1_actual.
fig = px.scatter(
    total_df,
    x="cand1_name",
    y="cand1_actual",
    size="cand1_actual",
    color="cand1_name",
    color_continuous_scale=px.colors.sequential.Viridis,
    title='CANDIDATE 1 & CANDIDATE 1 ACTUAL',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(0,250,154)',
    font_color="black",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [34]:
# Box plot of cand1_actual for each cand1_name.
fig = px.box(
    total_df,
    x="cand1_name",
    y="cand1_actual",
    color="cand1_name",
    title='CANDIDATE 1 & CANDIDATE 1 ACTUAL',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(128,128,0)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [35]:
# Pie of cand1_actual grouped by cand1_name.
fig = px.pie(
    total_df,
    names='cand1_name',
    values='cand1_actual',
    title='CANDIDATE 1 & CANDIDATE 1 ACTUAL',
    color_discrete_sequence=px.colors.sequential.Jet,
    hover_data=["race", "year", "location"],
)
fig.update_traces(hole=0.2, textposition='inside')
fig.update_layout(
    paper_bgcolor='rgb(255,140,0)',
    legend_title="CANDIDATE NAMES",
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font_color="black",
)
fig.show()
In [36]:
# Sunburst: cand1_name on the inner ring, year on the outer ring, sized by cand1_actual.
fig = px.sunburst(
    total_df,
    path=['cand1_name', 'year'],
    values='cand1_actual',
    color='cand1_name',
    title='CANDIDATE 1 & CANDIDATE 1 ACTUAL',
)
fig.update_layout(
    paper_bgcolor='rgb(220,220,220)',
    font_color="black",
    margin={"t": 50, "l": 100, "r": 0, "b": 0},
)
fig.show()
OBSERVATION: From these plots, among Candidate 1 entries "Democrat" has the highest actual percentage (79%), followed by "Obama". The "Democrat" candidate appears from 1998 to 2017 and has outliers.

---------------------------------------------------------------------------------------------------------------

8. What are Candidate 2's actual percentages, and which candidate got the highest?
In [37]:
# Bubble scatter: cand2_actual per cand2_name, bubble size also driven by cand2_actual.
fig = px.scatter(
    total_df,
    x="cand2_name",
    y="cand2_actual",
    size="cand2_actual",
    color="cand2_name",
    color_continuous_scale=px.colors.sequential.Viridis,
    title='CANDIDATE 2 & CANDIDATE 2 ACTUAL',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(0,250,154)',
    font_color="black",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [38]:
# Box plot of cand2_actual for each cand2_name.
fig = px.box(
    total_df,
    x="cand2_name",
    y="cand2_actual",
    color="cand2_name",
    title='CANDIDATE 2 & CANDIDATE 2 ACTUAL',
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(128,128,0)',
    font_color="white",
    showlegend=False,
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    height=600,
    width=980,
)
fig.show()
In [39]:
# Pie of cand2_actual grouped by cand2_name.
fig = px.pie(
    total_df,
    names='cand2_name',
    values='cand2_actual',
    title='CANDIDATE 2 & CANDIDATE 2 ACTUAL',
    color_discrete_sequence=px.colors.sequential.Jet,
    hover_data=["race", "year", "location"],
)
fig.update_traces(hole=0.2, textposition='inside')
fig.update_layout(
    paper_bgcolor='rgb(255,140,0)',
    legend_title="CANDIDATE NAMES",
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font_color="black",
)
fig.show()
In [40]:
# Sunburst: cand2_name on the inner ring, year on the outer ring, sized by cand2_actual.
fig = px.sunburst(
    total_df,
    path=['cand2_name', 'year'],
    values='cand2_actual',
    color='cand2_name',
    title='CANDIDATE 2 & CANDIDATE 2 ACTUAL',
)
fig.update_layout(
    paper_bgcolor='rgb(220,220,220)',
    font_color="black",
    margin={"t": 50, "l": 100, "r": 0, "b": 0},
)
fig.show()
OBSERVATION: From these plots, among Candidate 2 entries "Republican" has the highest actual percentage (83%), followed by "Clinton". The "Republican" candidate appears from 1998 to 2017 and has outliers.

--------------------------------------------------------------------------------------------------------------

9. How do Candidate 1's polled percentages compare with the actual percentages?
In [41]:
# Animated bubble scatter (one frame per year): actual result on x, polled percentage on y.
fig = px.scatter(
    total_df,
    x="cand1_actual",
    y="cand1_pct",
    size="cand1_actual",
    size_max=60,
    color="cand1_name",
    hover_name="pollster",
    animation_frame="year",
    animation_group="cand1_name",
    title="CANDIDATE 1 ACTUAL & CANDIDATE 1 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgb(240,255,255)',
    paper_bgcolor='rgb(169,169,169)',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    font_color="white",
)
fig.show()
In [42]:
# Polled percentage (x) against actual result (y) with an OLS trendline per cand1_name.
# NOTE(review): both rgba() colours below have components above their valid
# maxima; presumably the renderer clamps them - confirm the intended colours.
fig = px.scatter(
    total_df,
    x="cand1_pct",
    y="cand1_actual",
    size="cand1_actual",
    size_max=10,
    color="cand1_name",
    trendline="ols",
    title="CANDIDATE 1 ACTUAL & CANDIDATE 1 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgba(111, 882, 999, 100)',
    paper_bgcolor='rgba(222, 993, 1000, 200)',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    font_color="black",
)
fig.show()
In [43]:
# Joint density contours of cand1_pct vs cand1_actual with histogram marginals.
fig = px.density_contour(
    total_df,
    x="cand1_pct",
    y="cand1_actual",
    marginal_x="histogram",
    marginal_y="histogram",
    title="CANDIDATE 1 ACTUAL & CANDIDATE 1 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(255,20,147)',
    font_color="white",
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    width=950,
)
fig.show()
In [44]:
# Animated density heatmap (one frame per year) of cand1_actual vs cand1_pct,
# with a box marginal on x and a histogram marginal on y.
fig = px.density_heatmap(
    total_df,
    x="cand1_actual",
    y="cand1_pct",
    animation_frame="year",
    hover_data=['year', 'cand1_name'],
    marginal_x="box",
    marginal_y="histogram",
    title="CANDIDATE 1 ACTUAL & CANDIDATE 1 PERCENTAAGE",
)
fig.update_layout(
    paper_bgcolor='rgb(0,255,255)',
    font_color="black",
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    width=1000,
)
fig.show()
OBSERVATION: From these plots, in the region of highest density the difference between Candidate 1's polled percentage and actual percentage is about 2.

---------------------------------------------------------------------------------------------------------------

10. How do Candidate 2's polled percentages compare with the actual percentages?
In [45]:
# Animated bubble scatter (one frame per year): actual result on x, polled percentage on y.
fig = px.scatter(
    total_df,
    x="cand2_actual",
    y="cand2_pct",
    size="cand2_actual",
    size_max=60,
    color="cand2_name",
    hover_name="pollster",
    animation_frame="year",
    animation_group="cand2_name",
    title="CANDIDATE 2 ACTUAL & CANDIDATE 2 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgb(255,218,185)',
    paper_bgcolor='rgb(255,239,213)',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    font_color="black",
)
fig.show()
In [46]:
# Polled percentage (x) against actual result (y) with an OLS trendline per cand2_name.
fig = px.scatter(
    total_df,
    x="cand2_pct",
    y="cand2_actual",
    size="cand2_actual",
    size_max=10,
    color="cand2_name",
    trendline="ols",
    title="CANDIDATE 2 ACTUAL & CANDIDATE 2 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgb(253,245,230)',
    paper_bgcolor='rgb(188,143,143)',
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    font_color="white",
)
fig.show()
In [47]:
# Joint density contours of cand2_pct vs cand2_actual with histogram marginals.
fig = px.density_contour(
    total_df,
    x="cand2_pct",
    y="cand2_actual",
    marginal_x="histogram",
    marginal_y="histogram",
    title="CANDIDATE 2 ACTUAL & CANDIDATE 2 PERCENTAAGE",
)
fig.update_layout(
    plot_bgcolor='rgb(255,255,255)',
    paper_bgcolor='rgb(255,192,203)',
    font_color="black",
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    width=950,
)
fig.show()
In [48]:
# Animated density heatmap (one frame per year) of cand2_actual vs cand2_pct,
# with a box marginal on x and a histogram marginal on y.
fig = px.density_heatmap(
    total_df,
    x="cand2_actual",
    y="cand2_pct",
    animation_frame="year",
    hover_data=['year', 'cand2_name'],
    marginal_x="box",
    marginal_y="histogram",
    title="CANDIDATE 2 ACTUAL & CANDIDATE 2 PERCENTAAGE",
)
fig.update_layout(
    paper_bgcolor='rgb(255,250,205)',
    font_color="black",
    uniformtext_mode='hide',
    uniformtext_minsize=8,
    xaxis_categoryorder='category ascending',
    width=1000,
)
fig.show()
OBSERVATION: From these plots, in the region of highest density the difference between Candidate 2's polled percentage and actual percentage is smaller, i.e. about 0.5.

-------------------------------------------------------------------------------------------------------

11. How do the Candidate 1, 2 and 3 percentages compare, by Candidate 1 and Candidate 2 name, with the candidates' actual percentages and the actual margin?
In [49]:
# 3-D scatter of the three candidates' polled percentages, coloured and
# symbol-coded by cand1_name.
# NOTE(review): info() reports cand3_pct as object dtype - confirm the z axis
# is treated as numeric.
fig = px.scatter_3d(
    total_df,
    x='cand1_pct',
    y='cand2_pct',
    z='cand3_pct',
    color="cand1_name",
    symbol='cand1_name',
    opacity=0.7,
    title="CANDIDATE 1,2,3 PAERCENTAGES",
)
# Enlarge and outline the markers of every marker-mode trace.
fig.update_traces(
    selector={"mode": 'markers'},
    marker={"size": 8, "line": {"width": 1, "color": 'DarkGrey'}},
)
fig.update_layout(showlegend=True)
fig.show()
In [50]:
# 3-D scatter of the three candidates' polled percentages, coloured and
# symbol-coded by cand2_name.
fig = px.scatter_3d(
    total_df,
    x='cand1_pct',
    y='cand2_pct',
    z='cand3_pct',
    color="cand2_name",
    symbol='cand2_name',
    opacity=0.5,
    title="CANDIDATE 1,2,3 PAERCENTAGES",
)
# Enlarge and outline the markers of every marker-mode trace.
fig.update_traces(
    selector={"mode": 'markers'},
    marker={"size": 8, "line": {"width": 1, "color": 'DarkGrey'}},
)
fig.update_layout(showlegend=True)
fig.show()
In [51]:
# Nested donut: the outer ring holds the polled-percentage totals for
# candidates 1-3, the inner ring the actual-percentage totals for candidates 1-2.
# The totals are hard-coded; presumably they were obtained with
# total_df[<column>].sum() for each column named below - re-check them if the
# workbook changes.
sns.set(rc={'axes.facecolor': 'orange', 'figure.facecolor': 'mistyrose'})

group_names = ['cand1_pct', 'cand2_pct', 'cand3_pct']
group_size = [1031330.23, 985459.39, 61832.43]

subgroup_names = ['cand1_actual', 'cand2_actual']
subgroup_size = [1119605.35410609, 1056366.31309833]

# Mid-tone blue / red / green taken from the matching matplotlib colormaps.
ring_colors = [cmap(0.6) for cmap in (plt.cm.Blues, plt.cm.Reds, plt.cm.Greens)]

fig, ax = plt.subplots(figsize=(18, 8))
ax.axis('equal')

# Outer ring.
mypie, _ = ax.pie(
    group_size,
    labels=group_names,
    radius=1.3,
    colors=ring_colors,
    textprops={'color': 'deeppink', 'fontweight': 'bold', 'fontsize': 15},
)
plt.setp(mypie, width=0.28, edgecolor='white')

# Inner ring, with its labels drawn inside the wedges.
mypie2, _ = ax.pie(
    subgroup_size,
    labels=subgroup_names,
    labeldistance=0.50,
    radius=1.3 - 0.3,
    colors=ring_colors,
    textprops={'color': 'black', 'fontweight': 'bold', 'fontsize': 15},
)
plt.setp(mypie2, width=0.6, edgecolor='white')

plt.title(
    "CANDIDATE 1,2,3 PAERCENTAGES,ACTUAL",
    fontdict={'fontsize': 20, 'fontweight': 'bold'},
    color="green",
    loc='left',
)
plt.show()
In [52]:
# Funnel chart comparing three groups of column totals: poll percentages,
# actual results, and the actual margin. `go` is already imported in the
# notebook's first cell, so it is not re-imported here.
# Totals are computed from total_df (pandas skips NaN when summing) rather than
# hard-coded, so the chart cannot drift out of sync with the data.
pct_totals = [total_df[col].sum() for col in ('cand1_pct', 'cand2_pct', 'cand3_pct')]
actual_totals = [total_df[col].sum() for col in ('cand1_actual', 'cand2_actual')]
margin_total = [total_df['margin_actual'].sum()]

# Title previously misspelled "PAERCENTAGES".
fig = go.Figure(layout={'plot_bgcolor':'skyblue','title':'CANDIDATE 1,2,3 PERCENTAGES,ACTUALS & MARGIN ACTUAL'})

fig.add_trace(go.Funnel(
    name = "Candidate Percentages",
    orientation = "h",
    y = ["Cand1_pct","Cand2_pct","Cand3_pct"],
    x = pct_totals,
    marker={'color':['#FF1493','#FF69B4','#FFB6C1']},
    textposition = "inside",
    textinfo = "percent initial"))

fig.add_trace(go.Funnel(
    name = "Candidate Actual Percentages",
    orientation = "h",
    y = ['cand1_actual','cand2_actual'],
    x = actual_totals,
    marker={'color':['#00FFFF','#E0FFFF']},
    textposition = "inside",
    textinfo = 'percent initial'))

fig.add_trace(go.Funnel(
    name = "Margin Actual",
    orientation = "h",
    y = ["margin_actual"],
    x = margin_total,
    marker={'color':['#FFD700']},
    textposition = "inside",
    textinfo = 'percent initial'))

fig.update_layout(height=650,width=1000,font=dict(color="black",size = 15))

fig.show()
OBSERVATION: From these plots, Candidate 1's actual percentage (51%) minus Candidate 1's polled percentage (49%) gives a difference of 2%; Candidate 2's actual percentage (48%) minus Candidate 2's polled percentage (47%) gives a difference of 1%; Candidate 3's polled percentage is 3%. The parties most involved among these candidates are "Republican" and "Democrat".

---------------------------------------------------------------------------------------------------------------

12. Which pollster is the most "Biased"?
In [53]:
# Donut chart with one slice per pollster, sized by that pollster's `bias` values.
# NOTE(review): `bias` is signed (negative values appear in the data preview);
# confirm that a pie chart treats negative contributions the way this analysis intends.
fig = px.pie(
    total_df,
    names='pollster',
    values='bias',
    hover_data=["race", "year", "location"],
    title='POLLSTER & BIAS',
    color_discrete_sequence=px.colors.sequential.Jet,
)

# Labels are drawn inside the slices; hole=.2 turns the pie into a donut.
fig.update_traces(hole=.2, textposition='inside')

# Labels that would need a font smaller than 10 are hidden rather than shrunk.
fig.update_layout(
    paper_bgcolor='rgb(255,239,213)',
    legend_title="POLLSTER NAMES",
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    font={'color': "black"},
)
fig.show()
OBSERVATION: From these plots, "YouGov" is the most biased pollster, with the highest share ("8.94%").

----------------------------------------------------------------------------------------------------

13. Which pollster has the highest chance of conducting polling for "Win prediction"?
In [54]:
# Donut chart with one slice per pollster, sized by that pollster's `rightcall` values.
fig = px.pie(
    total_df,
    names='pollster',
    values='rightcall',
    hover_data=["race", "year", "location"],
    title='POLLSTER & RIGHTCALL',
    color_discrete_sequence=px.colors.sequential.Rainbow,
)

# Labels are drawn inside the slices; hole=.2 turns the pie into a donut.
fig.update_traces(hole=.2, textposition='inside')

# Labels that would need a font smaller than 10 are hidden rather than shrunk.
fig.update_layout(
    paper_bgcolor='rgb(224,255,255)',
    legend_title="POLLSTER NAMES",
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    font={'color': "black"},
)
fig.show()
In [55]:
# Scatter of `rightcall` per pollster, one colour per pollster, with the poll
# year available on hover.
# The previous `size="rightcall"` mapping was dropped: the update_traces call
# below sets a fixed marker size of 20 on every marker trace, which replaced
# the per-point sizes anyway.
fig = px.scatter(total_df, x="pollster", y="rightcall", color="pollster",
                 hover_data=["year"], title='POLLSTER & RIGHT CALL')

# Fixed-size markers with a thin dark outline.
fig.update_traces(marker=dict(size=20, line=dict(width=1, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

# Pollster tick labels and the legend are hidden; the names remain visible on hover.
fig.update_layout({'plot_bgcolor': 'rgb(245,255,250)', 'paper_bgcolor': 'rgb(255,215,0)'},
                  font=dict(color="black"), xaxis=dict(showticklabels=False),
                  width=980, height=550, showlegend=False)

fig.show()
In [56]:
# Sunburst with `rightcall` as the inner ring and `pollster` as the outer ring,
# sectors sized by summed `rightcall` and coloured per pollster.
# The title was a copy-paste leftover ("CANDIDATE 1 & CANDIDATE 1 PERCENTAGE");
# it now names the columns that are actually plotted.
fig = px.sunburst(total_df, path=['rightcall', 'pollster'], values='rightcall', color='pollster',
                  title='POLLSTER & RIGHT CALL')

fig.update_layout({'paper_bgcolor': 'rgb(240,255,255)'},
                  font=dict(color="black"), margin=dict(t=50, l=100, r=0, b=0))

fig.show()
OBSERVATION: From these plots, the pollster "Rasmussen Reports/Pulse Opinion Research" has the highest share (8%), so it had the best chance of conducting polls that predicted the winner in 2000.

---------------------------------------------------------------------------------------------------------------

14. Which pollster has the highest "ERROR"?
In [57]:
# Donut chart with one slice per pollster, sized by that pollster's `error` values.
fig = px.pie(
    total_df,
    names='pollster',
    values='error',
    hover_data=["race", "year", "location"],
    title='POLLSTER & ERROR',
    color_discrete_sequence=px.colors.sequential.Blackbody,
)

# Labels are drawn inside the slices; hole=.2 turns the pie into a donut.
fig.update_traces(hole=.2, textposition='inside')

# Labels that would need a font smaller than 10 are hidden rather than shrunk.
fig.update_layout(
    paper_bgcolor='rgb(127,255,212)',
    legend_title="POLLSTER NAMES",
    uniformtext_minsize=10,
    uniformtext_mode='hide',
    font={'color': "black"},
)
fig.show()
In [58]:
# Scatter of `error` per pollster, one colour per pollster, with the poll year
# available on hover.
# The previous `size="error"` mapping was dropped: the update_traces call below
# sets a fixed marker size of 20 on every marker trace, which replaced the
# per-point sizes anyway.
fig = px.scatter(total_df, x="pollster", y="error", color="pollster",
                 hover_data=["year"], title='POLLSTER & ERROR')

# Fixed-size markers with a thin dark outline.
fig.update_traces(marker=dict(size=20, line=dict(width=1, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

# Pollster tick labels and the legend are hidden; the names remain visible on hover.
fig.update_layout({'plot_bgcolor': 'rgb(245,255,250)', 'paper_bgcolor': 'rgb(255,215,0)'},
                  font=dict(color="black"), xaxis=dict(showticklabels=False),
                  width=980, height=550, showlegend=False)

fig.show()
OBSERVATION: From these plots, in 1998 the pollster "SurveyUSA" had the highest error rate (8.21%); the lowest error rate was in 2006, for "CNN/Opinion Research Corp".

------------------------------------------------------------------------------------------------------------

15. Which "Partisan" had more "Right calls"?
In [59]:
# Donut chart of `rightcall` per partisan affiliation, restricted to polls
# that actually have an affiliation.
# The original `query("partisan != None")` compared against the Python `None`
# object, which does not exclude entries stored as the text "None" (how the
# column appears in the data preview). The mask below drops both real missing
# values and the literal string.
# NOTE(review): confirm how non-partisan polls are encoded in the workbook.
has_partisan = total_df["partisan"].notna() & (total_df["partisan"] != "None")
partisan_polls = total_df[has_partisan]

# "PARISAN" in the title and legend title was a misspelling of "PARTISAN".
fig = px.pie(partisan_polls, values='rightcall', names='partisan',
             hover_data=["race", "year", "location"],
             color_discrete_sequence=px.colors.sequential.Blackbody,
             title='PARTISAN & RIGHT CALL')

# Labels are drawn inside the slices; hole=.2 turns the pie into a donut.
fig.update_traces(textposition='inside', hole=.2)

fig.update_layout({'paper_bgcolor': 'rgb(127,255,212)'}, uniformtext_minsize=10,
                  legend_title="PARTISAN",
                  uniformtext_mode='hide', font=dict(color="black"))
fig.show()
In [60]:
# Sunburst with `rightcall` as the inner ring and `partisan` as the outer ring,
# sectors sized by summed `rightcall` and coloured per partisan value.
# Title previously misspelled "PARISAN".
fig = px.sunburst(total_df, path=['rightcall', 'partisan'], values='rightcall', color='partisan',
                  title='PARTISAN & RIGHT CALL')

fig.update_layout({'paper_bgcolor': 'rgb(240,255,255)'},
                  font=dict(color="black"), margin=dict(t=50, l=100, r=0, b=0))

fig.show()
OBSERVATION: From these plots, in 1998 the "D" partisan had a high number of right calls (321).

-----------------------------------------------------------------------------------------------------------

16. Which comment is repeated, and in which "Race", "Year" and "Location"?
In [61]:
# Donut chart of how often each comment occurs.
# `values` is deliberately omitted so each slice is sized by the number of rows
# carrying that comment. The original passed values='pollno', which weighted
# every comment by the sum of its poll identifiers rather than by its frequency.
fig = px.pie(total_df, names='comment', hover_data=["race", "year", "location"],
             color_discrete_sequence=px.colors.sequential.YlOrRd,
             title='POLL NUMBER & COMMENT')

# Labels are drawn inside the slices; hole=.2 turns the pie into a donut.
fig.update_traces(textposition='inside', hole=.2)

# The legend is hidden; its title (previously the copy-pasted "PARISAN") now
# names the plotted column in case the legend is switched back on.
fig.update_layout({'paper_bgcolor': 'rgb(0,0,0)'}, uniformtext_minsize=10, legend_title="COMMENT",
                  uniformtext_mode='hide', font=dict(color="white"), showlegend=False)
fig.show()
OBSERVATION: From these plots, the most repeated comment is "previously listed as Zogby Analytics, telephone", in race "1998_House-G_US", year "1998_Gov-G_NY" and location "1998_Sen-G_NY".

-----------------------------------------------------------------------------------------------------------------

17. Which pollster had the highest "Sample size"?
In [62]:
# Donut chart with one slice per pollster, sized by that pollster's `samplesize` values.
fig = px.pie(
    total_df,
    names="pollster",
    values="samplesize",
    hole=.2,
    title='SAMPLE SIZE vs POLLSTER',
)

# Slice labels are drawn inside the slices in black.
fig.update_traces(textfont=dict(color="black"), textposition='inside')

# Labels that would need a font smaller than 10 are hidden rather than shrunk.
fig.update_layout(
    paper_bgcolor='rgb(238,232,170)',
    uniformtext_mode='hide',
    uniformtext_minsize=10,
    font={'color': "black"},
)
fig.show()
OBSERVATION: From these plots, the pollster "Rasmussen Reports/Pulse Opinion Research" has the highest "Sample Size" (1,391,287), i.e. 7.27%.

---------------------------------------------------------------------------------------------------------------

In [ ]: